#Goal: Create the figure panels describing the Sox2 locus and the proof-of-principle hopping experiment.
To re-run this code, change the paths in ‘File paths’ to the correct location of datasets.
##Libraries Load the libraries and set the parameters.
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
## tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse() masks IRanges::collapse()
## ✖ dplyr::combine() masks BiocGenerics::combine()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ lubridate::second() masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice() masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(stringr)
library(readr)
library(knitr)
library(RColorBrewer)
library(cowplot)##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
##
## Attaching package: 'ggpubr'
##
## The following object is masked from 'package:cowplot':
##
## get_legend
##
## Attaching package: 'GENOVA'
##
## The following object is masked from 'package:ggplot2':
##
## resolution
split_string <- function(vect,sep,N1,N2=N1){
library(stringr)
sapply(vect, function(X){
paste(str_split(X,sep)[[1]][N1:N2],collapse = sep)
},USE.NAMES = F)
}path_CTCF_sites = "/DATA/usr/v.franceschini/Workspaces/2024_01_MATHIAS_CTCF_PEAKS/VF240812_calling_CTCF_motifs_again/02_OUTPUTS/CTCF_sites/6764_2_ME_E2_CGATGT_S2_R1_001_peaks_motifs.bed"
path_CRE6_ints = "/DATA/projects/Sox2/GEO_collection/Tagmentation/mapped_integrations_short_neutral_insert.txt"
path_mm10_to_mm39 = "/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/general_analyses/region_capture_microC/mm10ToMm39.over.chain"
path_RCMC_WT_file = "/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/general_analyses/region_capture_microC/CM20230718_GSM6281849_RCMC_BR1_merged_allCap_WT_mm39.merged.50.mcool"
IntsInsilicoDir_E2226 <- "/DATA/usr/m.eder/projects/Tn5_tagmentation/E2226/pipeline_results/mm10/hopping/insertions/"
AlleleMappingDataDir_E2226 <- "/DATA/usr/m.eder/projects/Tn5_tagmentation/E2226/pipeline_results/mm10/hopping/allelic_insertions/"# Location of enhancer / gene
enhancer <- GRanges(seqnames = "chr3",
ranges = IRanges(start = 34753415,
end = 34766401),
strand = "*")
#gene based on mm10, RefSeq annotation
Sox2_gene <- GRanges(seqnames = "chr3",
ranges = IRanges(start = 34649995,
end = 34652461),
strand = "+")
#launch pads
landingPad_23 <- GRanges(seqnames = "chr3",
ranges = IRanges(start = 34643960,
end = 34643962),
strand = "+")
# new analysis (based on mapping to mm10 only)
CTCF_mm10_new <- import.bed(path_CTCF_sites)
#filtering:
CTCF_mm10.chr3 <- CTCF_mm10_new[seqnames(CTCF_mm10_new)== 'chr3']
# add the missing site after the SCR
CTCF_mm10.chr3_extra = sort(c(CTCF_mm10.chr3,
GRanges(seqnames = "chr3",
IRanges(start = 34772210, end = 34772210),
strand = "+")),
ignore.strand = T)
prange_plot = c(32643960, 36643960) #+/- 2MB from landing padNow import directly from the table we prepared for GEO submission (contains all integration from E2226, already annotated)
# Change mapping quality to numeric
tib_E2226 <- tib_E2226 %>%
mutate(mapq_1 = as.numeric(mapq_1),
mapq_2 = as.numeric(mapq_2),
mapq_1 = replace_na(mapq_1, 0),
mapq_2 = replace_na(mapq_2, 0))
# Factors for chr
chromosomes <- paste0("chr", c(1:19, "X"))
tib_E2226 <- tib_E2226 %>%
mutate(chr = factor(chr, levels = chromosomes)) %>%
drop_na(chr)
tib_E2226 = tib_E2226 %>%
mutate(cell_line = "CRE6",
population = "ctrl") %>%
mutate(mapped_arms = case_when(read_count_1 == 0 ~ "rv_only",
read_count_2 == 0 ~ "fw_only",
read_count_1 > 0 & read_count_2 > 0 ~ "both_arms")) %>%
mutate(hopped = start != 34643961)
#Filter table
tib_E2226_filt = tib_E2226 %>%
filter(population == "ctrl" | read_count >= 2) %>% #require at least two reads except for ctrl
filter(!(start >= 34721183 & start <= 34721192)) %>% #remove contamination from LP8
filter(strand %in% c("+", "-")) #because we sometimes split by strand, I'd prefer to remove the ambiguous integrations from all analyses
#E2226 had 10 control reactions sequenced with separate indices, here we combine them to get comparable data to the other experiments (with 1 index pair for the control pool)
tib_E2226_comb = tib_E2226_filt %>%
group_by(cell_line, population, chr, start, end, seq, strand, experiment) %>%
summarize(read_count = sum(read_count),
read_count_1 = sum(read_count_1),
read_count_2 = sum(read_count_2)) %>%
mutate(sample = paste0(cell_line, "_", population)) %>%
mutate(mapped_arms = case_when(read_count_1 == 0 ~ "rv_only",
read_count_2 == 0 ~ "fw_only",
read_count_1 > 0 & read_count_2 > 0 ~ "both_arms")) %>%
mutate(hopped = start != 34643961)
kable(tib_E2226_comb %>% group_by(population, cell_line) %>% summarise(n = n()))| population | cell_line | n |
|---|---|---|
| ctrl | CRE6 | 4865 |
# download.file('https://hgdownload.soe.ucsc.edu/goldenPath/mm10/liftOver/mm10ToMm9.over.chain.gz',
# "/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/general_analyses/region_capture_microC/mm10ToMm9.over.chain.gz")
# library(R.utils)
# gunzip("/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/general_analyses/region_capture_microC/mm10ToMm9.over.chain.gz")
chain_mm10_to_mm39 = import.chain(path_mm10_to_mm39)liftover the mm10 x-axis to mm39
#plotting range on mm39
plot_range_hiC = c(34.65E6,
34.9E6)
#annotations
annotation_gr_mm10 = c(Sox2_gene, enhancer)
names(annotation_gr_mm10) = c("Sox2", "SCR")
annotation_gr_mm39 = unlist(liftOver(annotation_gr_mm10, chain_mm10_to_mm39))
bed_tib_mm39 = as_tibble(annotation_gr_mm39)
# CTCF annotation
CTCF_ROI_mm10 = subsetByOverlaps(CTCF_mm10.chr3_extra, GRanges(seqnames = "chr3", IRanges(start = 30E6, end = 40E6)))
CTCF_ROI_mm39 = unlist(liftOver(CTCF_ROI_mm10, chain_mm10_to_mm39))
CTCF_ROI_mm39_tib = as_tibble(CTCF_ROI_mm39)
p = pyramid(exp = RCMC_1kb,
chrom = 'chr3',
colour = c(0, 1500),
start = plot_range_hiC[1],
end=plot_range_hiC[2])
p +
geom_rect(data = bed_tib_mm39, aes(xmin = start, xmax = end), ymin = -30E3, ymax = -10E3) +
geom_segment(data = CTCF_ROI_mm39_tib, aes(x = start, xend = start, col = strand), y = -30E3, yend = -10E3) on the 1kb resolution data (as in pyramid plot), 20kb window
ins_score_20 = insulation_score(RCMC_1kb,
window = 20,
norm_to = 'none')
#at mm39 34.9Mb a gap in the data starts (filtered out because there are not capture probes), the insulation score calculation does not 'understand' this, so any window that covers this will be wrong/weird
#therefore, I filter out the insulation score within 20kb (window size) from that gap:
filter(ins_score_20$insula_score, chrom == "chr3" ) %>%
filter(start <= 34.9E6 - 20E3) %>%
mutate(mid_bin = (start + end)/2) %>%
ggplot(aes(x = mid_bin, y = WT)) +
#add insulation score
geom_line() +
# annotate gene
geom_vline(xintercept = c(start(annotation_gr_mm39[1]), end(annotation_gr_mm39[1])), col = 'red') +
# annotate enhancer
geom_vline(xintercept = c(start(annotation_gr_mm39[2]), end(annotation_gr_mm39[2])), col = "#ffb000") +
#annotate CTCF sites
geom_rug(data = CTCF_ROI_mm39_tib, aes(x = start, col = strand), inherit.aes=F, sides = 't')+
# geom_vline(data = CTCF_ROI_mm39_tib, aes(xintercept = start, col = strand), linetype = 'dashed')+
#annotate the mm10 axis
geom_rug(data = as_tibble(mm39_axis), aes(x = start), inherit.aes = F) +
#layout
scale_x_continuous(breaks = scales::pretty_breaks(n = 6),
labels = scales::unit_format(scale = 1E-6, accuracy = 0.05, unit=NULL),
limits=plot_range_hiC, expand = c(0,0)) +
scale_y_continuous(limits = c(0, 500), expand = c(0,0)) +
labs(x = 'mid_bin on mm39', y = 'insulation_score') +
theme_classic() +
theme(legend.position = 'none')#Filter integrations for ctrl and being hopped
N_ints_plotted = tib_E2226_comb %>%
filter(population == "ctrl" & chr == "chr3" & hopped) %>%
filter(start >= prange_plot[1] & start <= prange_plot[2]) %>%
group_by(cell_line) %>%
summarise(count = n())
pHistogram_non_stranded_10kb =
ggplot(filter(tib_E2226_comb, cell_line == "CRE6" & population == "ctrl" & chr == "chr3" & hopped),
aes(x = start)) +
#plot data
geom_histogram(binwidth = 10000, fill = "#997a8d", alpha = 0.6)+
geom_density(aes(x = start, y = after_stat(count*10000))) +
# annotate enhancer
geom_vline(xintercept = c(start(enhancer), end(enhancer)), col = "#ffb000") +
# annotate gene
geom_vline(xintercept = c(start(Sox2_gene), end(Sox2_gene)), col = 'red') +
# annotate landing pad
geom_vline(xintercept = start(landingPad_23), col = 'black', linetype = "dotted") +
#layout:
theme_classic(base_size = 16) +
theme(legend.position = "bottom") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 6),
labels = scales::unit_format(scale = 1E-6, accuracy = 0.1, unit=NULL),
limits=prange_plot, expand = c(0,0)) +
scale_y_continuous(limits = c(0, NA), expand = expansion(mult = c(0,0.05))) +
labs(y = "Integrations/10kb", x = "genomic coordinates on chr3") +
geom_text(data = N_ints_plotted, inherit.aes = F, aes(label = paste0("n = ", count), x = Inf, y = Inf), vjust = 2, hjust = 1.5 ) +
# annotate("text", x = Inf, y = Inf, label = paste0("n = ", N_ints_plotted), vjust = 2, hjust = 2) +
ggtitle("Distribution of integrations")
pHistogram_non_stranded_10kb
## stranded
#How many integrations do we find in the plotted range
N_ints_plotted_str = tib_E2226_comb %>%
filter(population == "ctrl" & chr == "chr3" & hopped) %>%
filter(cell_line == "CRE6" & start >= prange_plot[1] & start <= prange_plot[2]) %>%
group_by(cell_line, strand) %>%
summarise(count = n())
# Stranded
pHistogram_stranded =
ggplot(filter(tib_E2226_comb, cell_line == "CRE6" & population == "ctrl" & chr == "chr3" & hopped),
aes(x = start)) +
#plot data
geom_histogram(binwidth = 10000, fill = "#997a8d", alpha = 0.6)+
geom_density(aes(x = start, y = after_stat(count*10000))) +
#facet
facet_grid(strand ~ .) +
# annotate enhancer
geom_vline(xintercept = c(start(enhancer), end(enhancer)), col = "#ffb000") +
# annotate gene
geom_vline(xintercept = c(start(Sox2_gene), end(Sox2_gene)), col = 'red') +
# annotate landing pad
geom_vline(xintercept = start(landingPad_23), col = 'black', linetype = "dotted") +
#layout:
theme_classic(base_size = 16) +
theme(legend.position = "bottom") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 6),
labels = scales::unit_format(scale = 1E-6, accuracy = 0.1, unit=NULL),
limits=prange_plot, expand = c(0,0)) +
scale_y_continuous(limits = c(0, 40), expand = expansion(mult = c(0,0.05))) + #no space underneath histogram
geom_text(data = N_ints_plotted_str, inherit.aes = F, aes(label = paste0("n = ", count), x = Inf, y = Inf), vjust = 2, hjust = 1.5 ) +
labs(y = "Integrations/10kb", x = "genomic coordinates on chr3")
pHistogram_strandedmyCols_chr3 = c(chr3 = "tomato3", other_chr = "lightgrey")
colScale_chr3 <- scale_colour_manual(name = "chr_category", values = myCols_chr3, aesthetics = c("fill","colour"))
int_count_per_chr_tib = tib_E2226_comb %>%
filter(cell_line == "CRE6" & population == "ctrl" & hopped ) %>%
mutate(chr_category = case_when(chr == "chr3" ~ "chr3",
.default = "other_chr")) %>%
group_by(chr, chr_category) %>%
summarize(N = n()) %>%
ungroup() %>%
mutate(prop_chr = N/sum(N))
ggplot(int_count_per_chr_tib, aes(x = chr, y = prop_chr, fill = chr_category)) +
geom_col() +
theme_classic()+
colScale_chr3 +
geom_text( aes(label = N), hjust = -0.1, angle = 90) +
theme(legend.position = "none") +
scale_x_discrete(guide = guide_axis(angle = 90)) +
scale_y_continuous(expand = expansion(mult = c(0,0.1))) +
labs(x = "chromosome", y = 'fraction of integrations')filter(tib_E2226_comb, cell_line == "CRE6" & population == "ctrl") %>%
group_by(hopped) %>%
summarize(N_reads = sum(read_count)) %>%
mutate(fraction_of_reads = N_reads/sum(N_reads))## # A tibble: 2 × 3
## hopped N_reads fraction_of_reads
## <lgl> <dbl> <dbl>
## 1 FALSE 3340 0.327
## 2 TRUE 6885 0.673
The hopping pipeline outputs a file with allele calls per integrations. These are based on the integration locations in the in silico genome, so they don’t match the locations in our final integration tibble. I reload the integrations, now on the in silico genome and the allele calls, and match them (for the CRE6 sample only)
# List all files
metadata_insilico_E2226 <- tibble(file = dir(IntsInsilicoDir_E2226,
recursive = T, full.names = T)) %>%
filter(grepl(".txt", file)) %>%
mutate(sample = str_remove(basename(file), "\\..*"))
# Load all files
tib_insilico_E2226 <- bind_rows(lapply(1:nrow(metadata_insilico_E2226),
function(i) {
tmp <- read_tsv(metadata_insilico_E2226$file[i],
col_types = cols(
.default = col_character(),
start = col_double(),
end = col_double(),
# gap_concordance = col_double(),
read_count = col_double(),
mapq = col_double(),
read_count_1 = col_double(),
mapq_1 = col_character(),
read_count_2 = col_double(),
mapq_2 = col_character(),
# sump = col_double(),
# p_adj = col_double()
)) %>%
mutate(sample = metadata_insilico_E2226$sample[i])
}))
tib_insilico_E2226 <- tib_insilico_E2226 %>%
mutate(chr = factor(chr, levels = chromosomes)) %>%
drop_na(chr)
# Load the allele specific integration data into R
# List all files
metadata_allelic_E2226 <- tibble(file = dir(AlleleMappingDataDir_E2226,
recursive = T, full.names = T)) %>%
filter(grepl(".txt", file)) %>%
mutate(sample = str_remove(basename(file), "\\..*"))
# Load all files
tib_allelic_E2226 <- bind_rows(lapply(1:nrow(metadata_allelic_E2226),
function(i) {
tmp <- read_tsv(metadata_allelic_E2226$file[i],
col_types = cols(
.default = col_character(),
start = col_double(),
end = col_double(),
)) %>%
mutate(sample = metadata_allelic_E2226$sample[i])
}))
tib_allelic_E2226 <- tib_allelic_E2226 %>%
mutate(chr = factor(chr, levels = chromosomes)) %>%
drop_na(chr)The start and end in the allele call txt file correspond to the ‘region_start’ and ‘region_end’ in the integrations txt file (the ranges of the mapped reads). Join based on that.
joined_tib = tib_insilico_E2226 %>%
mutate(cell_line = str_remove(sample, "_.*$")) %>% #remove everything after the first _
filter(cell_line == "CRE6") %>%
mutate(region_start_num = case_when(region_start == "." ~ NA,
.default =as.numeric(region_start)),
region_end_num = case_when(region_end == "." ~ NA,
.default =as.numeric(region_end)),
) %>%
left_join(tib_allelic_E2226, by = c("chr","sample", "region_start_num" = "start","region_end_num" = "end")) %>%
mutate(population = split_string(sample, "_", 2,3)) %>%
mutate(pool = split_string(sample, "_", 4)) In this experiment we sequenced the 10 tagmentation libraries from the one sample with separate indices. For all the analyses we merged these 10 libraries (because they actually correspond to one pool of cells). Do the same for the allele calling. If an integration is found in multiple pools, I add up the 129S and CAST scores and determine the most likely allele based on that (if CAST>129S it is in the CAST allele, and vice versa, if equal the allele is ambiguous)
joined_tib_ctrl_pooled = joined_tib %>%
filter(population == "non_sorted") %>%
group_by(chr, start, strand, cell_line, population) %>%
mutate(CAST = as.numeric(CAST),
`129S` = as.numeric(`129S`)) %>%
summarize(read_count = sum(read_count),
read_count_1 = sum(read_count_1),
read_count_2 = sum(read_count_2),
CAST_sum = sum(CAST),
`129S_sum` = sum(`129S`),
N_Ambiguous = sum(call == "Ambiguous"),
N_CAST = sum(call == "CAST"),
N_129S = sum(call == "129S")
) %>%
mutate(highest_score = case_when(CAST_sum > `129S_sum` ~ "CAST",
CAST_sum < `129S_sum` ~ "129S",
CAST_sum == `129S_sum` ~ "Ambiguous"
))myCols_alleles = c(`129S` = "tomato3", Ambiguous = 'lightgrey', CAST = "lightgrey")
colScale_alleles <- scale_colour_manual(name = "allele", values = myCols_alleles, aesthetics = c("fill","colour"))
joined_tib_ctrl_pooled %>%
filter(chr == "chr3") %>%
group_by(highest_score) %>%
summarize(N = n())%>%
mutate(prop_allele = N/sum(N)) %>%
dplyr::rename(allele = highest_score) %>%
ggplot(aes(x = allele, y = prop_allele, fill = allele)) + #by is a grouping factor to make the stat_prop work
geom_col() +
geom_text(aes(label = N), nudge_y = 0.02) +
scale_y_continuous(limits = c(0,NA), expand = expansion(mult = c(0, 0.05))) +
theme_classic() +
labs(x = "allele", y = "fraction of integrations") +
ggtitle("integrations on chr3") +
theme(legend.position = "none") +
colScale_alleles# Caculate which fraction of integrations has which read count
tib_read_distr_fr =
tib_E2226_comb %>%
filter(cell_line == "CRE6" & population == "ctrl" & hopped) %>%
group_by(read_count) %>%
summarise(n_of_Int = n()) %>%
mutate(total_int = sum(n_of_Int),
fraction_of_int = n_of_Int/total_int)
#combine all read counts from 8 and up into one category
tib_read_distr_fr_grouped = tib_read_distr_fr %>%
mutate(read_count_grouped = case_when(read_count >= 8 ~ ">= 8",
.default = as.character(read_count))) %>%
mutate(read_count_grouped = factor(read_count_grouped, levels = c(as.character(1:8), ">= 8"))) %>%
group_by(read_count_grouped) %>%
summarize(n_of_Int = sum(n_of_Int),
fraction_of_int = sum(fraction_of_int))
#plot
tib_read_distr_fr_grouped %>%
ggplot(aes(x = read_count_grouped, y = fraction_of_int)) +
geom_bar(stat = "identity") +
labs(x = "Unique reads per integration",
y = "Fraction of integrations") +
scale_y_continuous(limits = c(0,NA), expand = expansion(mult = c(0, 0.05)))+
geom_text(aes(label = n_of_Int), nudge_y = 0.02) +
theme_bw(base_size = 14)